/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.yarn;

import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.JobSubmissionResult;
import org.apache.flink.api.common.time.Deadline;
import org.apache.flink.client.deployment.ClusterDeploymentException;
import org.apache.flink.client.deployment.ClusterSpecification;
import org.apache.flink.client.program.ClusterClient;
import org.apache.flink.client.program.rest.RestClusterClient;
import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.HighAvailabilityOptions;
import org.apache.flink.configuration.ResourceManagerOptions;
import org.apache.flink.runtime.execution.ExecutionState;
import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.runtime.jobmaster.JobResult;
import org.apache.flink.runtime.rest.messages.EmptyRequestBody;
import org.apache.flink.runtime.rest.messages.job.JobDetailsInfo;
import org.apache.flink.runtime.rest.messages.job.metrics.JobMetricsHeaders;
import org.apache.flink.runtime.rest.messages.job.metrics.JobMetricsMessageParameters;
import org.apache.flink.runtime.rest.messages.job.metrics.Metric;
import org.apache.flink.runtime.testutils.CommonTestUtils;
import org.apache.flink.util.OperatingSystem;
import org.apache.flink.yarn.configuration.YarnConfigOptions;
import org.apache.flink.yarn.entrypoint.YarnSessionClusterEntrypoint;
import org.apache.flink.yarn.testjob.YarnTestJob;
import org.apache.flink.yarn.util.YarnTestUtils;

import org.apache.flink.shaded.guava18.com.google.common.collect.Iterables;

import org.apache.commons.lang3.StringUtils;
import org.apache.curator.test.TestingServer;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.yarn.api.protocolrecords.StopContainersRequest;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationReport;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.YarnApplicationState;
import org.apache.hadoop.yarn.client.api.YarnClient;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.security.NMTokenIdentifier;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;

import javax.annotation.Nonnull;

import java.io.File;
import java.io.IOException;
import java.time.Duration;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.TimeUnit;
import java.util.function.Function;
import java.util.function.Predicate;

import static org.apache.flink.util.Preconditions.checkState;
import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.notNullValue;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertThat;
import static org.junit.Assume.assumeTrue;

/**
 * Tests that verify the correct failover behavior of a highly available Flink session
 * cluster on YARN with ZooKeeper-based HA services.
 */
public class YARNHighAvailabilityITCase extends YarnTestBase {

	@ClassRule
	public static final TemporaryFolder FOLDER = new TemporaryFolder();

	private static final String LOG_DIR = "flink-yarn-tests-ha";
	private static final Duration TIMEOUT = Duration.ofSeconds(200L);

	private static TestingServer zkServer;
	private static String storageDir;

	private YarnTestJob.StopJobSignal stopJobSignal;
	private JobGraph job;

	@BeforeClass
	public static void setup() throws Exception {
		zkServer = new TestingServer();

		storageDir = FOLDER.newFolder().getAbsolutePath();

		// configure and start the YARN mini cluster (startYARNWithConfig is provided by YarnTestBase)
		YARN_CONFIGURATION.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class, ResourceScheduler.class);
		YARN_CONFIGURATION.set(YarnTestBase.TEST_CLUSTER_NAME_KEY, LOG_DIR);
		YARN_CONFIGURATION.setInt(YarnConfiguration.NM_PMEM_MB, 4096);
		startYARNWithConfig(YARN_CONFIGURATION);
	}

	@AfterClass
	public static void teardown() throws Exception {
		if (zkServer != null) {
			zkServer.stop();
			zkServer = null;
		}
	}

	@Before
	public void setUp() throws Exception {
		initJobGraph();
	}

	private void initJobGraph() throws IOException {
		stopJobSignal = YarnTestJob.StopJobSignal.usingMarkerFile(FOLDER.newFile().toPath());
		job = YarnTestJob.stoppableJob(stopJobSignal);
		final File testingJar =
			YarnTestBase.findFile("..", new YarnTestUtils.TestJarFinder("flink-yarn-tests"));

		assertThat(testingJar, notNullValue());

		job.addJar(new org.apache.flink.core.fs.Path(testingJar.toURI()));
	}

	/**
 * Tests that YARN will restart a killed {@link YarnSessionClusterEntrypoint} which will then resume
	 * a persisted {@link JobGraph}.
	 */
	@Test
	public void testKillYarnSessionClusterEntrypoint() throws Exception {
		runTest(() -> {
			assumeTrue(
				"This test kills processes via the pkill command. Thus, it only runs on Linux, Mac OS, Free BSD and Solaris.",
				OperatingSystem.isLinux() || OperatingSystem.isMac() || OperatingSystem.isFreeBSD() || OperatingSystem.isSolaris());

			final YarnClusterDescriptor yarnClusterDescriptor = setupYarnClusterDescriptor();
			yarnClusterDescriptor.addShipFiles(Arrays.asList(flinkShadedHadoopDir.listFiles()));

			final RestClusterClient<ApplicationId> restClusterClient = deploySessionCluster(yarnClusterDescriptor);

			try {
				final JobID jobId = submitJob(restClusterClient);
				final ApplicationId id = restClusterClient.getClusterId();

				waitUntilJobIsRunning(restClusterClient, jobId);

				killApplicationMaster(yarnClusterDescriptor.getYarnSessionClusterEntrypoint());
				waitForApplicationAttempt(id, 2);

				waitForJobTermination(restClusterClient, jobId);

				killApplicationAndWait(id);
			} finally {
				restClusterClient.shutdown();
			}
		});
	}

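	/**
	 * Tests that a job is recovered after the container running the TaskManager has been stopped.
	 */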
	@Test
	public void testJobRecoversAfterKillingTaskManager() throws Exception {
		runTest(() -> {
			final YarnClusterDescriptor yarnClusterDescriptor = setupYarnClusterDescriptor();
			yarnClusterDescriptor.addShipFiles(Arrays.asList(flinkShadedHadoopDir.listFiles()));

			final RestClusterClient<ApplicationId> restClusterClient = deploySessionCluster(yarnClusterDescriptor);
			try {
				final JobID jobId = submitJob(restClusterClient);
				waitUntilJobIsRunning(restClusterClient, jobId);

				stopTaskManagerContainer();
				waitUntilJobIsRestarted(restClusterClient, jobId, 1);

				waitForJobTermination(restClusterClient, jobId);

				killApplicationAndWait(restClusterClient.getClusterId());
			} finally {
				restClusterClient.shutdown();
			}
		});
	}

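	/**
	 * Waits until the given application has reached at least the expected application attempt.
	 */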
	private void waitForApplicationAttempt(final ApplicationId applicationId, final int attemptId) throws Exception {
		final YarnClient yarnClient = getYarnClient();
		checkState(yarnClient != null, "yarnClient must be initialized");

		CommonTestUtils.waitUntilCondition(() -> {
			final ApplicationReport applicationReport = yarnClient.getApplicationReport(applicationId);
			return applicationReport.getCurrentApplicationAttemptId().getAttemptId() >= attemptId;
		}, Deadline.fromNow(TIMEOUT));
	}

	/**
	 * Stops a container running {@link YarnTaskExecutorRunner}.
	 */
	private void stopTaskManagerContainer() throws Exception {
		// find the container running the TaskManager:
		ContainerId taskManagerContainer = null;
		NodeManager nodeManager = null;
		NMTokenIdentifier nmIdent = null;
		UserGroupInformation remoteUgi = UserGroupInformation.getCurrentUser();

		for (int nmId = 0; nmId < NUM_NODEMANAGERS; nmId++) {
			NodeManager nm = yarnCluster.getNodeManager(nmId);
			ConcurrentMap<ContainerId, Container> containers = nm.getNMContext().getContainers();
			for (Map.Entry<ContainerId, Container> entry : containers.entrySet()) {
				String command = StringUtils.join(entry.getValue().getLaunchContext().getCommands(), " ");
				if (command.contains(YarnTaskExecutorRunner.class.getSimpleName())) {
					taskManagerContainer = entry.getKey();
					nodeManager = nm;
					nmIdent = new NMTokenIdentifier(taskManagerContainer.getApplicationAttemptId(), null, "", 0);
					// register an NMToken identifier so that we are authorized to stop the container
					remoteUgi.addTokenIdentifier(nmIdent);
				}
			}
		}

		assertNotNull("Unable to find container with TaskManager", taskManagerContainer);
		assertNotNull("Illegal state", nodeManager);

		StopContainersRequest scr = StopContainersRequest.newInstance(Collections.singletonList(taskManagerContainer));

		nodeManager.getNMContext().getContainerManager().stopContainers(scr);

		// clean up auth for subsequent tests.
		remoteUgi.getTokenIdentifiers().remove(nmIdent);
	}

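	/**
	 * Kills the given application and waits until it has reached a terminal state.
	 */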
	private void killApplicationAndWait(final ApplicationId id) throws Exception {
		final YarnClient yarnClient = getYarnClient();
		checkState(yarnClient != null, "yarnClient must be initialized");

		yarnClient.killApplication(id);

		// wait until this specific application has reached a terminal state; checking the
		// report for this application id avoids false positives from other applications
		// on the shared mini cluster
		CommonTestUtils.waitUntilCondition(
			() -> EnumSet.of(YarnApplicationState.KILLED, YarnApplicationState.FINISHED)
				.contains(yarnClient.getApplicationReport(id).getYarnApplicationState()),
			Deadline.fromNow(TIMEOUT));
	}

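	/**
	 * Signals the test job to stop and waits until its {@link JobResult} is available.
	 */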
	private void waitForJobTermination(
			final RestClusterClient<ApplicationId> restClusterClient,
			final JobID jobId) throws Exception {
		stopJobSignal.signal();
		final CompletableFuture<JobResult> jobResult = restClusterClient.requestJobResult(jobId);
		jobResult.get(TIMEOUT.toMillis(), TimeUnit.MILLISECONDS);
	}

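	/**
	 * Creates a {@link YarnClusterDescriptor} with ZooKeeper HA enabled, up to 10 application
	 * attempts, a low ZooKeeper session timeout for fast failure detection, and an unbounded
	 * fixed-delay restart strategy so that the job survives process and container failures.
	 */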
	@Nonnull
	private YarnClusterDescriptor setupYarnClusterDescriptor() {
		final Configuration flinkConfiguration = new Configuration();
		flinkConfiguration.setString(YarnConfigOptions.APPLICATION_ATTEMPTS, "10");
		flinkConfiguration.setString(HighAvailabilityOptions.HA_MODE, "zookeeper");
		flinkConfiguration.setString(HighAvailabilityOptions.HA_STORAGE_PATH, storageDir);
		flinkConfiguration.setString(HighAvailabilityOptions.HA_ZOOKEEPER_QUORUM, zkServer.getConnectString());
		flinkConfiguration.setInteger(HighAvailabilityOptions.ZOOKEEPER_SESSION_TIMEOUT, 1000);

		flinkConfiguration.setString(ConfigConstants.RESTART_STRATEGY, "fixed-delay");
		flinkConfiguration.setInteger(ConfigConstants.RESTART_STRATEGY_FIXED_DELAY_ATTEMPTS, Integer.MAX_VALUE);

		final int minMemory = 100;
		flinkConfiguration.setInteger(ResourceManagerOptions.CONTAINERIZED_HEAP_CUTOFF_MIN, minMemory);

		return createYarnClusterDescriptor(flinkConfiguration);
	}

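	/**
	 * Deploys a small session cluster and returns the {@link RestClusterClient} for it.
	 */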
	private RestClusterClient<ApplicationId> deploySessionCluster(YarnClusterDescriptor yarnClusterDescriptor) throws ClusterDeploymentException {
		final int containerMemory = 256;
		final ClusterClient<ApplicationId> yarnClusterClient = yarnClusterDescriptor.deploySessionCluster(
			new ClusterSpecification.ClusterSpecificationBuilder()
				.setMasterMemoryMB(containerMemory)
				.setTaskManagerMemoryMB(containerMemory)
				.setSlotsPerTaskManager(1)
				.createClusterSpecification());

		assertThat(yarnClusterClient, is(instanceOf(RestClusterClient.class)));
		return (RestClusterClient<ApplicationId>) yarnClusterClient;
	}

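	/**
	 * Submits the test job to the cluster and returns its {@link JobID}.
	 */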
	private JobID submitJob(RestClusterClient<ApplicationId> restClusterClient) throws InterruptedException, ExecutionException {
		final CompletableFuture<JobSubmissionResult> jobSubmissionResultCompletableFuture =
			restClusterClient.submitJob(job);

		final JobSubmissionResult jobSubmissionResult = jobSubmissionResultCompletableFuture.get();
		return jobSubmissionResult.getJobID();
	}

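	/**
	 * Kills the process running the given cluster entrypoint via {@code pkill -f}.
	 */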
	private void killApplicationMaster(final String processName) throws IOException, InterruptedException {
		final Process exec = Runtime.getRuntime().exec("pkill -f " + processName);
		assertThat(exec.waitFor(), is(0));
	}

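	/**
	 * Waits until all vertices of the given job are in state {@link ExecutionState#RUNNING}.
	 */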
	private static void waitUntilJobIsRunning(RestClusterClient<ApplicationId> restClusterClient, JobID jobId) throws Exception {
		CommonTestUtils.waitUntilCondition(
			() -> {
				final JobDetailsInfo jobDetails = restClusterClient.getJobDetails(jobId).get();
				return jobDetails.getJobVertexInfos()
					.stream()
					.map(toExecutionState())
					.allMatch(isRunning());
			},
			Deadline.fromNow(TIMEOUT));
	}

	private static Function<JobDetailsInfo.JobVertexDetailsInfo, ExecutionState> toExecutionState() {
		return JobDetailsInfo.JobVertexDetailsInfo::getExecutionState;
	}

	private static Predicate<ExecutionState> isRunning() {
		return executionState -> executionState == ExecutionState.RUNNING;
	}

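	/**
	 * Waits until the job has been fully restarted at least the expected number of times.
	 */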
	private static void waitUntilJobIsRestarted(
		final RestClusterClient<ApplicationId> restClusterClient,
		final JobID jobId,
		final int expectedFullRestarts) throws Exception {
		CommonTestUtils.waitUntilCondition(
			() -> getJobFullRestarts(restClusterClient, jobId) >= expectedFullRestarts,
			Deadline.fromNow(TIMEOUT));
	}

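	/**
	 * Returns the number of full job restarts as reported by the {@code fullRestarts} metric,
	 * or 0 if the metric is not (yet) available.
	 */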
	private static int getJobFullRestarts(
		final RestClusterClient<ApplicationId> restClusterClient,
		final JobID jobId) throws Exception {

		return getJobMetric(restClusterClient, jobId, "fullRestarts")
			.map(Metric::getValue)
			.map(Integer::parseInt)
			.orElse(0);
	}

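	/**
	 * Queries the given job metric via the REST API.
	 */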
	private static Optional<Metric> getJobMetric(
		final RestClusterClient<ApplicationId> restClusterClient,
		final JobID jobId,
		final String metricName) throws Exception {

		final JobMetricsMessageParameters messageParameters = new JobMetricsMessageParameters();
		messageParameters.jobPathParameter.resolve(jobId);
		messageParameters.metricsFilterParameter.resolveFromString(metricName);

		final Collection<Metric> metrics = restClusterClient.sendRequest(
			JobMetricsHeaders.getInstance(),
			messageParameters,
			EmptyRequestBody.getInstance()).get().getMetrics();

		final Metric metric = Iterables.getOnlyElement(metrics, null);
		checkState(metric == null || metric.getId().equals(metricName));
		return Optional.ofNullable(metric);
	}
}
